In [1]:
import gc
import geopandas as gpd
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from itables import show
from sklearn.cluster import DBSCAN
from tslearn.clustering import TimeSeriesKMeans
%matplotlib inline
%config InlineBackend.figure_formats = ['retina']
pd.set_option("mode.copy_on_write", True)
Sample¶
In [2]:
# Load NYC TLC yellow-taxi trip records for March 2024 (Parquet).
df = pd.read_parquet("data/nyc/yellow_tripdata_2024-03.parquet")
In [3]:
# Taxi-zone polygons; the zip:// scheme lets geopandas read straight from the archive.
gdf = gpd.read_file("zip://data/nyc/taxi_zones.zip")
In [4]:
# Interactive preview of the zone table (itables); scrollX handles the wide frame.
show(gdf, scrollX=True)
| OBJECTID | Shape_Leng | Shape_Area | zone | LocationID | borough | geometry |
|---|---|---|---|---|---|---|
|
Loading ITables v2.1.0 from the internet...
(need help?) |
Explore¶
In [5]:
# Distinct boroughs present in the zone shapefile.
gdf.borough.unique()
Out[5]:
array(['EWR', 'Queens', 'Bronx', 'Manhattan', 'Staten Island', 'Brooklyn'],
dtype=object)
In [6]:
def plot_borough(figsize: tuple = (10, 10)):
    """Plot all taxi zones and label each borough at the mean of its zone centroids.

    Reads the module-level ``gdf`` GeoDataFrame. Each borough label is placed
    at the average of its zones' centroids, which keeps the text roughly in
    the middle of the borough.

    Parameters
    ----------
    figsize : tuple, default (10, 10)
        Matplotlib figure size in inches (was hard-coded in the original).
    """
    fig, ax = plt.subplots(figsize=figsize)
    gdf.plot(ax=ax)
    for borough in gdf.borough.unique():
        gdf_subset = gdf[gdf["borough"] == borough]
        # NOTE(review): .centroid assumes a projected CRS — confirm gdf is projected.
        centroid = gdf_subset["geometry"].centroid
        ax.text(
            centroid.x.mean(),
            centroid.y.mean(),
            borough,
            color="white",
            ha="center",
            va="center",
        )
    plt.show()
In [7]:
# Render the borough-level overview map.
plot_borough()
In [8]:
def plot_zones(borough: str, figsize: tuple):
    """Plot the taxi zones of one borough, labelling each zone with its OBJECTID.

    Passing ``borough="all"`` plots every borough EXCEPT Manhattan —
    presumably because Manhattan's zones are too dense at city scale and are
    plotted separately by the caller (verify against the calling cell).
    """
    fig, ax = plt.subplots(figsize=figsize)
    if borough == "all":
        gdf_subset = gdf[gdf["borough"] != "Manhattan"]
    else:
        gdf_subset = gdf[gdf["borough"] == borough]
    gdf_subset.plot(ax=ax)
    for row in gdf_subset.itertuples():
        center = row.geometry.centroid
        ax.text(
            center.x,
            center.y,
            row.OBJECTID,
            fontsize=8,
            color="white",
            ha="center",
            va="center",
        )
    plt.show()
In [9]:
# Zone maps: everything outside Manhattan at city scale, then Manhattan alone.
plot_zones("all", (12, 12))
plot_zones("Manhattan", (12, 12))
Modify¶
Rename¶
In [10]:
# Normalise the TLC column names to snake_case.
df_renamed = df.rename(
    columns={
        "VendorID": "vendor_id",
        "RatecodeID": "rate_code_id",
        "PULocationID": "pick_up_location_id",
        "DOLocationID": "drop_off_location_id",
        "Airport_fee": "airport_fee",
    }
)
Remove Duplicates¶
In [11]:
# Drop exact duplicate rows; the cell's value shows how many were removed (0 here).
df_deduped = df_renamed.drop_duplicates()
len(df_renamed) - len(df_deduped)
Out[11]:
0
Filter Examples¶
In [12]:
# Keep only trips whose pickup timestamp falls inside March 2024 (half-open interval).
start_date_mask = df_deduped["tpep_pickup_datetime"] >= "2024-03-01"
end_date_mask = df_deduped["tpep_pickup_datetime"] < "2024-04-01"
# Bug fix: the masks are built from df_deduped, so they must index df_deduped.
# The original filtered df_renamed, which only happened to work because the
# dedupe step above removed zero rows (indexes still aligned).
df_filtered = df_deduped[start_date_mask & end_date_mask]
len(df_deduped) - len(df_filtered)
Out[12]:
23
Integrate Centroids¶
In [13]:
# Zone centroids as plain x/y columns. Computing the centroid GeoSeries once
# and reading .x/.y is vectorised — the original applied a Python lambda per
# row and computed each centroid twice (once for x, once for y).
centroids = gdf["geometry"].centroid
gdf["geometry_x"] = centroids.x
gdf["geometry_y"] = centroids.y
In [14]:
# Attach pickup-zone centroid coordinates by joining on the zone id.
# NOTE(review): merge defaults to an inner join, so trips whose
# pick_up_location_id has no matching OBJECTID are silently dropped.
df_pu = df_filtered.merge(
    gdf[["OBJECTID", "geometry_x", "geometry_y"]],
    left_on="pick_up_location_id",
    right_on="OBJECTID",
)
df_pu = df_pu.drop(columns="OBJECTID").rename(
    columns={"geometry_x": "pick_up_x", "geometry_y": "pick_up_y"}
)
In [15]:
# Same inner join for the drop-off zone centroids.
df_do = df_pu.merge(
    gdf[["OBJECTID", "geometry_x", "geometry_y"]],
    left_on="drop_off_location_id",
    right_on="OBJECTID",
)
df_do = df_do.drop(columns="OBJECTID").rename(
    columns={"geometry_x": "drop_off_x", "geometry_y": "drop_off_y"}
)
In [16]:
# Trips lost to the two inner joins (location ids with no matching zone polygon).
len(df_filtered) - len(df_do)
Out[16]:
35374
In [17]:
# Spot-check the enriched frame (first 300 rows only, to keep the output small).
show(df_do.iloc[:300], scrollX=True)
| vendor_id | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | rate_code_id | store_and_fwd_flag | pick_up_location_id | drop_off_location_id | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | airport_fee | pick_up_x | pick_up_y | drop_off_x | drop_off_y |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
Loading ITables v2.1.0 from the internet...
(need help?) |
In [18]:
# Optional cache of the enriched frame; left commented out so a re-run does not rewrite it.
# df_do.to_parquet("data/nyc/yellow_tripdata_2024-03.uc.parquet", compression=None)
In [19]:
# Free the intermediate frames — kernel memory persists across cells.
del df, df_renamed, df_deduped, df_filtered, df_pu
gc.collect()
Out[19]:
25263
Model¶
Time-Series Clustering¶
In [20]:
# Hourly passenger totals, indexed by pickup time.
df_ts = df_do.set_index("tpep_pickup_datetime")
n_passengers = df_ts["passenger_count"].resample("h").sum().to_frame()
In [21]:
# Pivot the hourly series into one row per day and one column per hour.
n_passengers["date"] = n_passengers.index.strftime("%Y-%m-%d")
n_passengers["time"] = n_passengers.index.strftime("%H:%M")
n_passengers_daily = n_passengers.pivot(
    index="date",
    columns="time",
    values="passenger_count",
)
# Vectorised replacement for the original per-element
# `lambda x: pd.to_datetime(x).day_name()` — parses the whole index once.
n_passengers_daily["day_name"] = pd.to_datetime(n_passengers_daily.index).day_name()
# First 24 columns are the hourly values ("00:00".."23:00"); excludes day_name.
n_passengers_daily_inp = n_passengers_daily.iloc[:, :24]
K-Means DTW¶
In [22]:
# Elbow sweep: within-cluster sum of squares for k = 1..10 (DTW metric).
# tslearn's fit() returns the estimator, so the sweep chains into one expression.
wcss = [
    TimeSeriesKMeans(n_clusters=k, metric="dtw", random_state=12345)
    .fit(n_passengers_daily_inp)
    .inertia_
    for k in range(1, 11)
]
In [23]:
# Elbow plot: look for the bend to choose the number of clusters.
plt.figure(figsize=(6, 3))
plt.plot(range(1, 11), wcss)
plt.ylabel("Within Cluster Sum of Squares")
plt.xlabel("Clusters")
plt.xticks(range(1, 11))
plt.show()
In [24]:
# k = 3 chosen from the elbow plot above; label each day with its cluster.
kmeans_ts = TimeSeriesKMeans(n_clusters=3, metric="dtw", random_state=12345)
n_passengers_daily["kmeans_clusters"] = kmeans_ts.fit_predict(n_passengers_daily_inp)
DBSCAN¶
In [25]:
# DBSCAN on the raw 24-dimensional daily profiles; eps is in passenger-count
# units (Euclidean distance) — presumably tuned by eye, verify against data scale.
dbscan_ts = DBSCAN(eps=3500, min_samples=5)
n_passengers_daily["dbscan_clusters"] = dbscan_ts.fit_predict(n_passengers_daily_inp)
Results¶
In [26]:
show(n_passengers_daily, scrollX=True)
| time | 00:00 | 01:00 | 02:00 | 03:00 | 04:00 | 05:00 | 06:00 | 07:00 | 08:00 | 09:00 | 10:00 | 11:00 | 12:00 | 13:00 | 14:00 | 15:00 | 16:00 | 17:00 | 18:00 | 19:00 | 20:00 | 21:00 | 22:00 | 23:00 | day_name | kmeans_clusters | dbscan_clusters |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | |||||||||||||||||||||||||||
|
Loading ITables v2.1.0 from the internet...
(need help?) |
In [27]:
def plot_clusters_ts(algorithm: str):
    """Plot each day's hourly passenger profile, coloured by its cluster label.

    Parameters
    ----------
    algorithm : str
        "kmeans" or "dbscan"; selects the ``{algorithm}_clusters`` column of
        the module-level ``n_passengers_daily`` frame.
    """
    col_name = f"{algorithm}_clusters"
    plt.figure(figsize=(10, 6))
    colors = plt.cm.tab10(np.arange(10))
    # DBSCAN labels noise as -1; map it to one colour index past the largest
    # cluster label. NOTE(review): indexes past tab10 if max label >= 9.
    anomalous_idx = n_passengers_daily[col_name].max() + 1
    for i, day in enumerate(n_passengers_daily.index):
        cluster = n_passengers_daily[col_name].iloc[i]
        if cluster == -1:
            color = anomalous_idx
            label = "Noises"
        else:
            label = f"Cluster {cluster}"
            color = cluster
        plt.plot(
            n_passengers_daily.columns[:24],
            n_passengers_daily.iloc[:, :24].loc[day],
            color=colors[color],
            label=label,
        )
    plt.ylabel("Passenger Count")
    plt.xlabel("Hours (March 2024)")
    plt.xticks(rotation=45)
    # Deduplicate legend entries (one line was plotted per day). The original
    # computed this twice; the first, unsorted computation was dead code.
    handles, labels = plt.gca().get_legend_handles_labels()
    unique_labels = sorted(set(labels))
    unique_handles = [handles[labels.index(label)] for label in unique_labels]
    plt.legend(unique_handles, unique_labels)
    plt.show()
In [28]:
# Daily profiles coloured by k-means (DTW) cluster assignment.
plot_clusters_ts(algorithm="kmeans")
In [29]:
# Daily profiles coloured by DBSCAN cluster assignment (noise highlighted).
plot_clusters_ts(algorithm="dbscan")
In [30]:
def tabulate_clusters_ts(algorithm: str):
    """Show, per cluster, the dates and the unique weekday names it contains."""
    cluster_col = f"{algorithm}_clusters"
    grouped = n_passengers_daily.reset_index().groupby(cluster_col)
    summary = grouped.agg({"date": list, "day_name": lambda names: list(set(names))})
    show(summary, scrollX=True)
In [31]:
# Dates and weekdays grouped by k-means cluster.
tabulate_clusters_ts(algorithm="kmeans")
| time | date | day_name |
|---|---|---|
| kmeans_clusters | ||
|
Loading ITables v2.1.0 from the internet...
(need help?) |
In [32]:
# Dates and weekdays grouped by DBSCAN cluster.
tabulate_clusters_ts(algorithm="dbscan")
| time | date | day_name |
|---|---|---|
| dbscan_clusters | ||
|
Loading ITables v2.1.0 from the internet...
(need help?) |
Anomaly Detection in Time Series¶
In [33]:
# Minute-level passenger totals for point-anomaly detection.
df_tp = df_do.set_index("tpep_pickup_datetime")
n_passengers_min = df_tp["passenger_count"].resample("min").sum().to_frame()
# DatetimeIndex.day_name() is vectorised — replaces the original per-element
# `index.map(lambda x: x.day_name())`.
n_passengers_min["day_name"] = n_passengers_min.index.day_name()
DBSCAN¶
In [34]:
# 1-D DBSCAN on the minute counts (column 0 only); roughly, a minute becomes
# noise when too few other minutes have a count within eps=1 of it.
dbscan_tp = DBSCAN(eps=1, min_samples=50)
n_passengers_min["clusters"] = dbscan_tp.fit_predict(n_passengers_min.iloc[:, [0]])
Results¶
In [35]:
# Noise points (label -1) are the anomalous minutes.
n_passengers_min_noises = n_passengers_min[n_passengers_min["clusters"] == -1]
show(n_passengers_min_noises, scrollX=True)
| passenger_count | day_name | clusters | |
|---|---|---|---|
| tpep_pickup_datetime | |||
|
Loading ITables v2.1.0 from the internet...
(need help?) |
In [36]:
# Scatter the minute-level counts over the month, highlighting DBSCAN noise.
plt.figure(figsize=(12, 8))
for cluster_id in n_passengers_min["clusters"].unique():
    members = n_passengers_min[n_passengers_min["clusters"] == cluster_id]
    label = "Anomalous" if cluster_id == -1 else "Normal Points"
    plt.scatter(members.index, members["passenger_count"], label=label)
plt.ylabel("Passenger Count")
plt.xlabel("Days (March 2024)")
plt.xticks(pd.date_range("2024-03-01", "2024-04-01"), rotation=45)
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%m-%d"))
plt.legend()
plt.show()
In [37]:
# Count anomalous minutes per weekday, ordered Monday..Sunday.
days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
days_ord = {day: i for i, day in enumerate(days)}
n_passengers_min_noises_days = n_passengers_min_noises.groupby("day_name").size()
n_passengers_min_noises_days = n_passengers_min_noises_days.to_frame()
n_passengers_min_noises_days["order"] = n_passengers_min_noises_days.index.map(days_ord)
# Avoid inplace=True (the notebook enables copy_on_write); plain assignment
# keeps the cell idempotent and chain-friendly.
n_passengers_min_noises_days = n_passengers_min_noises_days.sort_values(by="order")
show(n_passengers_min_noises_days, scrollX=True)
| 0 | order | |
|---|---|---|
| day_name | ||
|
Loading ITables v2.1.0 from the internet...
(need help?) |
In [38]:
# Weekday distribution of anomalous minutes (column 0 holds the group sizes).
plt.figure(figsize=(6, 3))
plt.bar(n_passengers_min_noises_days.index, n_passengers_min_noises_days[0])
plt.ylabel("Count")
plt.xlabel("Days of the Week")
plt.show()
In [39]:
# Count anomalous minutes per hour of the day.
hours = n_passengers_min_noises.index.hour
n_passengers_min_noises_hours = n_passengers_min_noises.groupby(hours).size()
show(n_passengers_min_noises_hours, scrollX=True)
| 0 | |
|---|---|
| tpep_pickup_datetime | |
|
Loading ITables v2.1.0 from the internet...
(need help?) |
In [40]:
# Hour-of-day distribution of anomalous minutes.
plt.figure(figsize=(6, 3))
plt.bar(n_passengers_min_noises_hours.index, n_passengers_min_noises_hours)
plt.ylabel("Count")
plt.xlabel("Hours of a Day")
plt.xticks(n_passengers_min_noises_hours.index)
plt.show()
Hotspot Detection¶
In [41]:
# Restrict to trips picked up during the 18:00 hour.
n_trips_pu = df_do[df_do["tpep_pickup_datetime"].dt.hour == 18]
In [42]:
def stratified_sample_per_day(group: pd.DataFrame, proportion: float = 0.2) -> pd.DataFrame:
    """Sample a fixed proportion of rows from one group, keeping at least one row.

    Parameters
    ----------
    group : pd.DataFrame
        Rows belonging to a single pickup day.
    proportion : float, default 0.2
        Fraction of the group to keep; truncated to an int, floored at 1.

    Returns
    -------
    pd.DataFrame
        Reproducible sample (fixed random_state).
    """
    sample_size = max(1, int(len(group) * proportion))
    return group.sample(n=sample_size, random_state=12345)
In [43]:
# 20% stratified sample per pickup day to keep the spatial DBSCAN tractable.
# NOTE(review): groupby(...).apply on a frame-returning function warns about
# including the grouping column in newer pandas — confirm the pinned version.
n_trips_pu_sampled = (
    n_trips_pu.groupby(n_trips_pu["tpep_pickup_datetime"].dt.date)
    .apply(stratified_sample_per_day)
    .reset_index(drop=True)
)
# Model input: pickup-zone centroid coordinates only.
n_trips_pu_inp = n_trips_pu_sampled[["pick_up_x", "pick_up_y"]]
DBSCAN¶
In [44]:
# Spatial DBSCAN on pickup centroids; eps is in the CRS's linear units
# (presumably feet for the NYC taxi-zone projection — verify).
dbscan_pu = DBSCAN(eps=3850, min_samples=50)
n_trips_pu_sampled["clusters_pu"] = dbscan_pu.fit_predict(n_trips_pu_inp)
Results¶
In [45]:
# Spot-check the labelled sample (first 300 rows only).
show(n_trips_pu_sampled.iloc[:300], scrollX=True)
| vendor_id | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | rate_code_id | store_and_fwd_flag | pick_up_location_id | drop_off_location_id | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | airport_fee | pick_up_x | pick_up_y | drop_off_x | drop_off_y | clusters_pu |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
Loading ITables v2.1.0 from the internet...
(need help?) |
In [46]:
# Map the pickup hotspots over a faint zone basemap, one colour per cluster.
# (The original computed an unused `anomalous_idx` here — dead code, removed.)
fig, ax = plt.subplots(figsize=(10, 10))
gdf.plot(ax=ax, alpha=0.1)
for cluster in np.sort(n_trips_pu_sampled["clusters_pu"].unique()):
    # -1 is DBSCAN noise.
    label = "Noises" if cluster == -1 else f"Cluster {cluster}"
    points = n_trips_pu_sampled[n_trips_pu_sampled["clusters_pu"] == cluster]
    ax.scatter(points["pick_up_x"], points["pick_up_y"], label=label, marker="x")
plt.legend()
plt.show()
In [47]:
# Trips per spatial cluster (-1 = noise).
n_trips_clusters = n_trips_pu_sampled["clusters_pu"].value_counts().sort_index()
show(n_trips_clusters, scrollX=True)
| count | |
|---|---|
| clusters_pu | |
|
Loading ITables v2.1.0 from the internet...
(need help?) |
In [48]:
# Cluster sizes; log scale because the counts presumably span orders of magnitude.
plt.figure(figsize=(6, 3))
plt.bar(n_trips_clusters.index, n_trips_clusters)
plt.ylabel("Number of Trips")
plt.yscale("log")
plt.xlabel("Clusters")
plt.xticks(n_trips_clusters.index)
plt.show()
In [ ]: